\documentclass{article}[11pt]
\usepackage{sectsty}
\usepackage{enumerate}
\usepackage{bm}
\usepackage{amsmath, amsthm, amssymb}
\usepackage[usenames,dvipsnames]{color}

\newcommand{\bW} { \bm{W} }
\newcommand{\bU} { \bm{U} }
\newcommand{\bb} { \bm{b} }
\newcommand{\bz} { \bm{z} }
\newcommand{\bmf} { \bm{f} }
\newcommand{\bx} { \bm{x} }
\newcommand{\by} { \bm{y} }
\newcommand{\yhat} { \bm{\hat{y}} }
\newcommand{\btheta} { \bm{\theta} }
\newcommand{\bg} { \bm{g} }
\newcommand{\bh} { \bm{h} }
\newcommand{\bL} { \bm{L} }
\newcommand{\bI} { \bm{I} }
\newcommand{\bdelta} { \bm{\delta} }
\newcommand{\smx} { \text{softmax} }
\newcommand{\relu} { \text{ReLU} }
\newcommand{\sgn} { \text{sgn} }
\newcommand{\diag} { \text{diag} }

\newcommand{\todo}[1] { \color{red}[TODO: #1]\color{black} }


\newcommand{\alns}[1] {
	\begin{align*} #1 \end{align*}
}
\newcommand{\pd}[2] {
 \frac{\partial #1}{\partial #2}
}

\setlength{\parindent}{0pt}
\sectionfont{\fontsize{12}{12}\selectfont}
\subsectionfont{\fontsize{11}{0}{\vspace{-10pt}}\selectfont}
\title{Computing Neural Network Gradients}
\date{}
\author{Kevin Clark}

\begin{document}
\maketitle
\vspace{-5mm}


\section{Introduction}
The purpose of these notes is to demonstrate how to quickly compute neural network gradients. This will hopefully help you with question 3 of Assignment 2 (if you haven't already done it) and with the midterm (which will have at least one significant gradient computation question). It is \textbf{not} meant to provide an intuition for how backpropagation works -- for that I recommend going over lecture 5\footnote{http://web.stanford.edu/class/cs224n/lectures/cs224n-2017-lecture5.pdf} and the cs231 course notes\footnote{http://cs231n.github.io/optimization-2/} on backpropagation.
\section{Vectorized Gradients}
While it is a good exercise to compute the gradient of a neural network with respect to a single parameter (e.g., a single element in a weight matrix), in practice this tends to be quite slow. Instead, it is more efficient to keep everything in matrix/vector form. The basic building block of vectorized gradients is the {\it Jacobian Matrix}. Suppose we have a function $\bmf: \mathbb{R}^n \to \mathbb{R}^m$ that maps a vector of length $n$ to a vector of length $m$: $\bmf(\bx) = [f_1(x_1, ..., x_n), f_2(x_1, ..., x_n), ..., f_m(x_1, ..., x_n)]$. Then its Jacobian is.

\alns{
	\pd{\bmf}{\bx} = \begin{bmatrix}
		\pd{f_1}{x_1}  &  \dots   & \pd{f_1}{x_n} \\
		\vdots             & \ddots  & \vdots            \\
		\pd{f_m}{x_1} & \dots    & \pd{f_m}{x_n}            \\
	\end{bmatrix}
}
That is, $( \pd{\bmf}{\bx} )_{ij} = \pd{f_i}{x_j}$ (which is just a standard non-vector derivative). The Jacobian matrix will be useful for us because we can apply the chain rule to a vector-valued function just by multiplying Jacobians. \\

As a little illustration of this, suppose we have a function $\bmf(x) = [f_1(x), f_2(x)]$ taking a scalar to a vector of size 2 and a function $\bg(\by) = [g_1(y_1, y_2), g_2(y_1, y_2)]$ taking a vector of size two to a vector of size two. Now let's compose them to get $\bg(x) = [g_1(f_1(x), f_2(x)), g_2(f_1(x), f_2(x))]$. Using the regular chain rule, we can compute the derivative of $\bg$ as the Jacobian
\alns{
	\pd{\bg}{x} = \begin{bmatrix}
		\pd{}{x}g_1(f_1(x), f_2(x))  \\
		\pd{}{x}g_2(f_1(x), f_2(x)) \\
	\end{bmatrix} =  \begin{bmatrix}
		\pd{g_1}{f_1}\pd{f_1}{x} + \pd{g_1}{f_2}\pd{f_2}{x}   \\
		\pd{g_2}{f_1}\pd{f_1}{x} + \pd{g_2}{f_2}\pd{f_2}{x}  \\
	\end{bmatrix}
}
And we see this is the same as multiplying the two Jacobians:
\alns{
\pd{\bg}{x} = \pd{\bg}{\bmf}\pd{\bmf}{x} = \begin{bmatrix}
	\pd{g_1}{f_1} & \pd{g_1}{f_2} \\
	\pd{g_2}{f_1} & \pd{g_2}{f_2} \\
\end{bmatrix}
\begin{bmatrix}
	\pd{f_1}{x} \\
	\pd{f_2}{x} \\
\end{bmatrix}
}

\section{Useful Identities}
This section will now go over how to compute the Jacobian for several simple functions. It will provide some useful identities you can apply when taking neural network gradients. \\

\begin{enumerate}[(1)]

%1. ------------------------------------------------------------------

\item \textbf{Matrix times column vector with respect to the column vector} ($\bz = \bW \bx$, what is $\pd{\bz}{\bx}$?) \\

Suppose $\bW \in \mathbb{R}^{n \times m}$. Then we can think of $\bz$ as a function of $\bx$ taking an $m$-dimensional vector to an $n$-dimensional vector. So its Jacobian will be $n \times m$. Note that
\alns{
	z_i &= \sum_{k=1}^m  W_{ik} x_{k}
}
So an entry $(\pd{\bz}{\bx})_{ij}$ of the Jacobian will be
\alns{
	(\pd{\bz}{\bx})_{ij} = \pd{z_i}{x_j} &=  \pd{}{x_j}\sum_{k=1}^m  W_{ik} x_{k} = \sum_{k=1}^m  W_{ik} \pd{}{x_j}x_{k} = W_{ij}
}
because $\pd{}{x_j}x_{k} = 1$ if $k = j$ and 0 if otherwise. So we see that $\boxed{\pd{\bz}{\bx} = \bW}$

%2. ------------------------------------------------------------------

\item \textbf{Row vector times matrix with respect to the row vector} \\ ($\bz =  \bx \bW$, what is $\pd{\bz}{\bx}$?) \\

%Similarly to (1), we have
%\alns{
%	z_i &= \sum_{k=1}^m  W_{ki} x_{k}
%}
%so with a computation similar to (1) we get
%\alns{
%	\pd{z_i}{x_j} = W_{ji}
%}
%which means that $\boxed{\pd{\bz}{\bx} = \bW^T}$
A computation similar to (1) shows that $\boxed{\pd{\bz}{\bx} = \bW^T}$.

\item \textbf{A vector with itself}\\($\bz = \bx$, what is $\pd{\bz}{\bx}$? ) \\
We have $z_i = x_i$. So
\alns{
(\pd{\bz}{\bx})_{ij} = \pd{z_i}{x_j} = \pd{}{x_j}x_i = \begin{cases}
	1 \phantom{abc} \text{if $i = j$} \\
	0 \phantom{abc} \text{if otherwise}
	\end{cases}
}
So we see that the Jacobian $\pd{\bz}{\bx}$ is a diagonal matrix where the entry at $(i, i)$ is 1. This is just the identity matrix: $\boxed{\pd{\bz}{\bx} = \bI}$. When applying the chain rule, this term will disappear because a matrix multiplied by the identity matrix does not change.

\item \textbf{An elementwise function applied a vector}\\ ($\bz = f(\bx)$, what is $\pd{\bz}{\bx}$? ) \\
If $f$ is being applied elementwise, we have $z_i = f(x_i)$. So
\alns{
	(\pd{\bz}{\bx})_{ij} = \pd{z_i}{x_j} = \pd{}{x_j}f(x_i) = \begin{cases}
	f'(x_i) \phantom{abc} \text{if $i = j$} \\
	0 \phantom{abc} \text{if otherwise}
	\end{cases}
}
So we see that the Jacobian $\pd{\bz}{\bx}$ is a diagonal matrix where the entry at $(i, i)$ is the derivative of $f$ applied to $x_i$. We can write this as $\boxed{\pd{\bz}{\bx} = \diag(f'(\bx))}$. Since multiplication by a diagonal matrix is the same as doing elementwise multiplication by the diagonal, we could also write $\boxed{\circ f'(\bx)}$ when applying the chain rule.

\item \textbf{Matrix times column vector with respect to the matrix} \\ ($\bz = \bW \bx$, $\bdelta = \pd{J}{\bz}$ what is $\pd{J}{\bW} = \pd{J}{\bz} \pd{\bz}{\bW} = \bdelta \pd{\bz}{\bW}$?) \\

This is a bit more complicated than the other identities. The reason for including $\pd{J}{\bz}$% = \bdelta$
in the above problem formulation will become clear in a moment.

First suppose we have a loss function $J$ (a scalar) and are computing its gradient with respect to a matrix $\bW \in \mathbb{R}^{n \times m}$. Then we could think of $J$ as a function of $\bW$ taking $nm$ inputs (the entries of $\bW$) to a single output ($J$). This means the Jacobian $\pd{J}{\bW}$ would be a $1 \times nm$ vector. But in practice this is not a very useful way of arranging the gradient. It would be much nicer if the derivatives were in a $n \times m$ matrix like this:
\alns{
	\pd{J}{\bW} = \begin{bmatrix}
		\pd{J}{W_{11}}  &  \dots   & \pd{J}{W_{1m}} \\
		\vdots             & \ddots  & \vdots            \\
		\pd{J}{W_{n1}} & \dots    & \pd{J}{W_{nm}}            \\
	\end{bmatrix}
}
Since this matrix has the same shape as $\bW$, we could just subtract it (times the learning rate) from $\bW$ when doing gradient descent. So (in a slight abuse of notation) let's find this matrix as $\pd{J}{\bW}$ instead.

%This way of arranging the gradients becomes complicated when computing $\pd{\bz}{\bW}$. Unlike $J$, $\bz$ is vector-valued. So if we are trying to rearrange the gradients like with $\pd{J}{\bW}$, $\pd{\bz}{\bW}$ would become an $n \times m \times m$ tensor!

I think the easiest way of computing this matrix is by finding the gradient for a single weight $W_{ij}$. We have
\alns{
	z_k &= \sum_{l=1}^m  W_{kl} x_{l} \\
	\pd{z_k}{W_{ij}} &=  \sum_{l=1}^m  x_l \pd{}{W_{ij}}W_{kl}
}
Note that $\pd{}{W_{ij}}W_{kl} = 1$ if $i = k$ and $j = l$ and 0 if otherwise. So if $k \neq i$ everything in the sum is zero and the gradient is zero. Otherwise, the only nonzero element of the sum is when $l = j$, so we just get $x_j$. Thus we find $\pd{z_k}{W_{ij}} = x_j$ if $k = i$ and 0 if otherwise. Another way of writing this is
 \alns{
 	\pd{\bz}{W_{ij}} = \begin{bmatrix} 0 \\ \vdots \\ 0 \\ x_j \\ 0 \\ \vdots \\ 0 \end{bmatrix}
	\gets \text{$i$th element}
}
Now let's compute $\pd{J}{W_{ij}}$
\alns{
	\pd{J}{W_{ij}} = \pd{J}{\bz}\pd{\bz}{W_{ij}} = \bdelta \pd{\bz}{W_{ij}} = \sum_{k=1}^m \delta_k \pd{z_k}{W_{ij}} = \delta_i x_j
}
(the only nonzero term in the sum is $\delta_i \pd{z_i}{W_{ij}}$). To get $\pd{J}{\bW}$ we want a matrix where entry $(i, j)$ is $\delta_i x_j$. This matrix is equal to the outer project $\boxed{\pd{J}{\bW} = \bdelta^T \bx}$

\item \textbf{Row vector time matrix with respect to the matrix} \\ ($\bz = \bx \bW$, $\bdelta = \pd{J}{\bz}$ what is $\pd{J}{\bW} = \bdelta \pd{\bz}{\bW}$?) \\
A similar computation to (5) shows that $\boxed{\pd{J}{\bW} = \bx^T  \bdelta }$.

\item \textbf{Cross-entropy loss with respect to logits} ($\yhat = \text{softmax}(\btheta)$, $J = CE(\by, \yhat)$, what is $\pd{J}{\btheta}$?) \\

You did this in Assignment 1! The gradient is $\boxed{\pd{J}{\btheta} = \yhat - \by}$

%I think the easiest way of dealing with this is avoiding the issue by taking the derivative with respect to a single row of $\bW$ $\bW_i$. Then the Jacobian $\pd{z_i}{\bW_{i}}$ will be a matrix instead of a tensor.
%\alns{
%	z_i &= \sum_{k=1}^m  W_{ik} x_{k} \\
%	\pd{z_i}{W_{ij}} &= x_j% \to \pd{z_i}{\bW_{i}} = \bx
%}
%Since entry $i, j$ of the Jacobian is $x_j$ we have
%\alns{
%	\pd{\bz}{\bW_i} =  \begin{bmatrix} \bx \\ \bx \\ \vdots \\ \bx \end{bmatrix}
%}
%Now let's think about $\pd{J}{\bW_i}$

% vector times another vector vector element wise multiplied with another vector norm of a vector with respect to the vector

\end{enumerate}

These identities will be enough to let you quickly compute the gradients for many neural networks. However, it's important to know how to compute Jacobians for other functions as well in case they show up. Some examples if you want practice: dot product of two vectors, elementwise product of two vectors, 2-norm of a vector. Feel free to use these identities on the midterm and assignments.

\section{Example: 1-Layer Neural Network with Embeddings}
This section provides an example of computing the gradients of a full neural network.
In particular we are going to compute the gradients of the dependency parser you are building in Assignment 2. First let's write out the forward pass of the model.
\alns{
	\bx &= [\bL_{w_0}, \bL_{w_1}, ..., \bL_{w_{m - 1} }]  \\
	\bz &= \bx \bW + \bb_1 \\
	\bh &= \relu(\bz) \\
	\btheta &= \bh \bU + \bb_2 \\
	\yhat &= \smx(\btheta) \\
	J &= CE(\by, \yhat)
}
It helps to break up the model into the simplest parts possible, so note that we defined $\bz$ and $\btheta$ to split up the activation functions from the linear transformations in the network's layers. The dimensions of the model's parameters are
\[
\bL \in \mathbb{R}^{|V| \times d} \quad\quad
\bb_1 \in \mathbb{R}^{1 \times D_h} \quad\quad
\bW \in \mathbb{R}^{md \times D_h} \quad\quad
\bb_2 \in \mathbb{R}^{1 \times N_c} \quad\quad
\bU \in \mathbb{R}^{D_h \times N_c} \quad\quad
\]
where $|V|$ is the vocabulary size, $d$ is the size of our word vectors, $m$ is the number of features, $D_h$ is the size of our hidden layer, and $N_c$ is the number of classes. \\

In this example, we will compute all the gradients:

\[
\frac{\partial J}{\partial \bU} \quad\quad
\frac{\partial J}{\partial \bb_2} \quad\quad
\frac{\partial J}{\partial \bW} \quad\quad
\frac{\partial J}{\partial \bb_1} \quad\quad
\frac{\partial J}{\partial \bL_{w_i}} \quad\quad
\]

To start with, recall that $\relu(x) = \max(x, 0)$. This means
\begin{align*}
\relu'(x) = \begin{cases}
	1 \phantom{abc} \text{if $x > 0$} \\
	0 \phantom{abc} \text{if otherwise}
	\end{cases}
	= \sgn(\relu(x))
\end{align*}
where $\sgn$ is the signum function. Note that as you did in Assignment 1 with sigmoid, we are able to write the derivative of the activation in terms of the activation itself. \\

Now let's write out the chain rule for $\pd{J}{\bU}$ and $\pd{J}{\bb_2}$:
\alns{
	\pd{J}{\bU} &= \pd{J}{\yhat}\pd{\yhat}{\btheta}\pd{\btheta}{\bU} \\
	\pd{J}{\bb_2} &= \pd{J}{\yhat}\pd{\yhat}{\btheta}\pd{\btheta}{\bb_2}
}
Notice that $\pd{J}{\yhat}\pd{\yhat}{\btheta} = \pd{J}{\btheta}$ is present in both gradients. This makes the math a bit cumbersome. Even worse, if we're implementing the model without automatic differentiation, computing $\pd{J}{\btheta}$ twice will be inefficient. So it will help us to define some variables to represent the intermediate derivatives:
\alns{
	\bdelta_1 = \pd{J}{\btheta} \quad\quad
	\bdelta_2 = \pd{J}{\bz}
}
We can compute them as follows:
\alns{
\bdelta_1 &= \pd{J}{\btheta} = \yhat - \by & \text{this is just identity (7)}\\
\bdelta_2 &= \pd{J}{\bz} = \pd{J}{\btheta}\pd{\btheta}{\bh}\pd{\bh}{\bz} & \text{using the chain rule}\\
&= \bdelta_1\pd{\btheta}{\bh}\pd{\bh}{\bz} & \text{substituting in $\bdelta_1$}\\
&= \bdelta_1\ \bU^T \pd{\bh}{\bz} & \text{using identity (2)} \\
&= \bdelta_1\ \bU^T \circ \relu'(\bz) & \text{using identity (4)} \\
&= \bdelta_1\ \bU^T \circ \sgn(\bh) & \text{we computed this earlier} \\
}
A good way of checking our work is by looking at the dimensions of the terms in the derivative:
\alns{
&\pd{J}{\bz} \qquad = \qquad \bdelta_1 \qquad\qquad\qquad \bU^T \qquad \circ \qquad \sgn(\bh) \\
(1 &\times D_h) \hspace{9mm} (1 \times N_c) \hspace{10mm} (N_c \times D_h) \hspace{12mm} (D_h)
}
We see that the dimensions of all the terms in the gradient match up (i.e., the number of columns in a term equals the number of rows in the next term).
This will always be the case if we computed our gradients correctly. \\

%\todo{something about checking dimensions}
 Now we can use the error terms to compute our gradients:
\alns{
	\pd{J}{\bU} &= \pd{J}{\btheta}\pd{\btheta}{\bU} = \bdelta_1\pd{\btheta}{\bU} = \bh^T \bdelta_1 & \text{using identity (6)} \\
	\pd{J}{\bb_2} &= \pd{J}{\btheta}\pd{\btheta}{\bb_2} = \bdelta_1\pd{\btheta}{\bb_2} =\bdelta_1 & \text{using identity (3)} \\
	\pd{J}{\bW} &= \pd{J}{\btheta}\pd{\bz}{\bW} = \bdelta_2\pd{\bz}{\bW} = \bx^T \bdelta_2 & \text{using identity (6)} \\
	\pd{J}{\bb_1} &= \pd{J}{\btheta}\pd{\bz}{\bb_1} = \bdelta_2\pd{\bz}{\bb_1} = \bdelta_2 & \text{using identity (3)} \\
	\pd{J}{\bL_{w_i}} &= \pd{J}{\bz}\pd{\bz}{\bL_{w_i}} = \bdelta_2 \pd{\bz}{\bL_{w_i}}
	%\pd{J}{\bL_{w_i}} &= \pd{J}{\btheta}\pd{\bz}{\bx}\pd{\bx}{\bL_{w_i}} = \bdelta_2 \bW^T\pd{\bx}{\bL_{w_i}} & \text{Using identity (2)}
}
All that's left is to compute $\pd{\bz}{\bL_{w_i}}$. It helps to split up $\bW$ by rows like this:
\alns{
	\bx \bW &= [\bL_{w_0}, \bL_{w_1}, ..., \bL_{w_{m - 1} }] \bW
	= [\bL_{w_0}, \bL_{w_1}, ..., \bL_{w_{m - 1} }]
	\begin{bmatrix} \bW_{0:d} \\ \bW_{d:2d} \\ \vdots \\ \bW_{(m - 1)d:md} \end{bmatrix} \\
	&= \bL_{w_0} \bW_{0:d}  + \bL_{w_1} \bW_{d:2d} + \dots + \bL_{w_{m - 1}}\bW_{(m - 1)d:md}
	=\sum_{j = 0}^{m - 1} \bL_{w_j} \bW_{dj:d(j + 1)}
}
When we compute $\pd{\bz}{\bL_{w_i}}$, only the $i$th term in this sum is nonzero, so we get
\alns{
	\pd{\bz}{\bL_{w_i}} = \pd{}{\bL_{w_i}} =  \bL_{w_i } \bW_{di:d(i + 1)} =  (\bW_{di:d(i + 1)})^T
%	= (\bW^T)_{\cdot, di:d(i + 1)}
}
using identity (2).



\end{document}
